Packages

Click to expand
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(glue)
library(ggplot2)
library(ggnetwork)
library(igraph)
## 
## Attaching package: 'igraph'
## 
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## 
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## 
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## 
## The following object is masked from 'package:tidyr':
## 
##     crossing
## 
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## 
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## 
## The following object is masked from 'package:base':
## 
##     union


Functions

Click to expand
# load data functions
read_snp_csv <- function(distance_matrix_csv_path){
    tryCatch({
        snp_data <- read_csv(distance_matrix_csv_path, show_col_types = F)
    }, error = function(e) {
        stop("Error reading the SNP data file. Please check that it is a valid CSV file.")
    })
    # check valid matrix
    if(nrow(snp_data) + 1 != ncol(snp_data)){
        stop('Number of rows and cols in snp data must be the same')
    }
    if(!'Name' %in% names(snp_data)){
        stop('Name column not in snp_data, is it from Pathogenwatch?')
    }
    return(
        snp_data %>% 
            pivot_longer(cols = !Name, values_to = 'dist', names_to = 'iso2') %>%
            rename(iso1=Name)
    )
}

read_kleborate_data_csv <- function(kleborate_data_path){
    tryCatch({
        kleborate_data <- read_csv(kleborate_data_path, show_col_types = F)
    }, error = function(e) {
        stop("Error reading the kleborate data file. Please check that it is a valid CSV file.")
    })
    # TODO: validate kleborate data columns
    return(kleborate_data)
}

read_metadata_csv <- function(metadata_path){
    tryCatch({
        metadata <- read_csv(metadata_path, show_col_types = F)
    }, error = function(e) {
        stop("Error reading the metadata file. Please check that it is a valid CSV file.")
    })
    if (! all(REQUIRED_METADATA_COLS %in% names(metadata)) ) {
        stop(paste0("The following columns are required in the metadata file: ", 
                    paste(REQUIRED_METADATA_COLS, collapse = ", ")))
    }
    return(metadata)
}

format_sample_dates <- function(metadata){
    if(! all(c('id', 'Year', 'Month', 'Day') %in% names(metadata)) ) {
        stop(paste0("'id', 'Year', 'Month', and 'Day' columns are required."))
    }
    sample_dates <- metadata %>% dplyr::select(id, Year, Month, Day) %>% 
        dplyr::filter(!(is.na(Year) | is.na(Month) | is.na(Day))) %>% 
        dplyr::rowwise() %>%  
        dplyr::mutate(formatted_date = paste0(Year, "-", Month, "-", Day)) %>% 
        dplyr::mutate(formatted_date = lubridate::as_date(formatted_date)) %>% 
        # dplyr::mutate(decimal_date = lubridate::decimal_date(formatted_date)) %>% 
        dplyr::select(id, formatted_date) %>% 
        dplyr::ungroup()
    return(sample_dates)
}

# cluster functions

get_snp_and_epi_data <- function(snp_data, sample_dates, metadata, 
                                 geo_column = "Country"){
    if (length(geo_column) > 1){stop("Can only supply one geo_column")}
    if(!geo_column %in% names(metadata)){
        stop(glue::glue("The specified '{geo_column}' column is missing from metadata"))
    }
    geo_data <- metadata %>% select(id, all_of(geo_column))
    colnames(geo_data) <- c("id", "geo")
    snp_and_epi_data <- snp_data %>% 
        # days between isolation
        dplyr::left_join(sample_dates, by = c('iso1' = 'id')) %>% dplyr::rename('iso1_date' = 'formatted_date') %>% 
        dplyr::left_join(sample_dates, by = c('iso2' = 'id')) %>% dplyr::rename('iso2_date' = 'formatted_date') %>% 
        dplyr::mutate(days = abs(floor(as.numeric(
            difftime(iso1_date, iso2_date, units = "days")
        )))) %>% 
        dplyr::select(-c(iso1_date, iso2_date)) %>% 
        # check isolates with same or different location
        dplyr::left_join(geo_data, by = c('iso1' = 'id')) %>% dplyr::rename('geo1' = 'geo') %>% 
        dplyr::left_join(geo_data, by = c('iso2' = 'id')) %>% dplyr::rename('geo2' = 'geo') %>% 
        dplyr::mutate(pair_location = if_else(geo1 == geo2, "Same", "Different")) %>% 
        dplyr::select(-c(geo1, geo2))
    return(snp_and_epi_data)
}

#' Create a graph from table with distances and a geographic column 
#' @param distance_data A tibble with columns: 'iso1', 'iso2', 
#' and one or more columns containing distances (e.g., SNPs)
#' between the isolates in the 'iso1' and 'iso2' columns
#' @param dist_columns Vector of names of each of the columns containing distances 
#' e.g., dist_columns = c('SNPs', 'days')
#' @param dist_thresholds Vector of distance thresholds [int] to be applied for each distance column
#' Length of this vector must be equal to the length of dist_columns
#' e.g., dist_threshold = c(10, 14)
#' @param pair_location_column Name of the geographic column. Column must contain either of the
#' values "Same" or "Different" representing isolate pairs from the same or different locations
#' @export
get_cluster_graph <- function(snp_and_epi_data, dist_columns, dist_thresholds, 
                              pair_location_column = "pair_location", directed=FALSE) {
    if (length(dist_columns) != length(dist_thresholds)) {
        stop("The lengths of vars `dist_columns` and `dist_thresholds` must be equal")
    }
    if(!all(dist_columns %in% names(snp_and_epi_data))){
        stop('One or more specified columns missing from snp_and_epi_data')
    }
    if(!pair_location_column %in% names(snp_and_epi_data)){
        stop(glue::glue("The specified '{pair_location_column}' column is missing from snp_and_epi_data"))
    }
    if(!tibble::is_tibble(snp_and_epi_data)){stop('snp_and_epi_data is not a tibble')}
    # filter out pairs based on the threshold for each distance column
    for (i in seq(1:length(dist_columns))) {
        snp_and_epi_data %<>% 
            dplyr::filter(!!rlang::sym(dist_columns[i]) <= dist_thresholds[i]) %>% 
            dplyr::select(-c(!!rlang::sym(dist_columns[i]))) 
    }
    # filter out pairs with same isolate 
    # filter out pairs belonging to the same geo location
    # generate graph
    snp_and_epi_data %>% 
        dplyr::filter(! iso1 == iso2) %>% 
        dplyr::filter(!!rlang::sym(pair_location_column) == "Same") %>% 
        dplyr::select(c(iso1, iso2)) %>% 
        as.matrix() %>% 
        igraph::graph_from_edgelist(., directed = directed)
}


# adapted from KleborateR function; removed dependency on kleborate data
get_cluster_membership_from_graph <- function(cluster_graph) {
    if(!igraph::is_igraph(cluster_graph)){stop('cluster_graph is not an igraph')}
    igraph::components(cluster_graph)$membership %>%
        tibble::as_tibble(rownames = "id") %>%
        dplyr::mutate(Cluster = as.character(value)) %>%
        dplyr::select(!value)
}

calculate_cluster_proportion <- function(clusters_data){
    clusters_data %>% 
        dplyr::summarise(
            cluster_proportion = round(sum(!is.na(Cluster)) / n(), 2)
        ) %>% dplyr::pull(cluster_proportion)
}

# plotting functions

plot_dist_distribution <- function(distance_data, dist_column, x_label = "Pairwise distance", 
                                   plot_title = "Distribution of pairwise distances",
                                   scale_y = F, facet_plot = F, facet_column = NULL){
    if(! all(c('iso1', 'iso2') %in% colnames(distance_data)) ) {
        stop("'iso1' and 'iso2' columns required")
    }
    if(! dist_column %in% colnames(distance_data) ) {
        stop(glue::glue("'{dist_column}' column missing from distance_data"))
    }
    if(facet_plot){
        if(is.null(facet_column)){
            stop("'facet_plot' set to TRUE but facet column not provided")
        }
        if(! facet_column %in% colnames(distance_data) ) {
            stop(glue::glue("'{facet_column}' column missing from distance_data"))
        }
    }
    # plot
    dist_plot <- ggplot(distance_data, aes(x = !!rlang::sym(dist_column))) +
        ggplot2::geom_histogram(binwidth = 10, color = "black", alpha = 0.8) +
        ggplot2::labs(title = plot_title, x = x_label, y = "Number of isolate pairs") +
        ggplot2::theme_minimal() +
        ggplot2::theme(axis.text = element_text(size = 14)) +
        ggplot2::theme(axis.title = element_text(size = 14)) +
        ggplot2::theme(legend.text = element_text(size = 14))
    if (scale_y){
        dist_plot <- dist_plot + scale_y_log10(labels = scales::comma_format())
    }
    if (facet_plot){
        dist_plot <- dist_plot + 
            facet_wrap(~pair_location, nrow=2, scales="free_y")
    }
    return(dist_plot)
}

# Adapted from KleborateR
plot_transmission_network <- function(snp_graph, kleborate_data, var1) {
    snp_graph %>% 
        dplyr::left_join(
            kleborate_data %>% dplyr::select(all_of(c('Genome Name', var1))),
            by=c('name'='Genome Name')) %>%
        ggplot2::ggplot(aes(x=x, y=y, xend=xend, yend=yend, label=name)) +
        ggnetwork::geom_edges() +
        ggnetwork::geom_nodes(
            aes(col=.data[[var1]]), 
            size=3.5, shape=16, alpha=.5
        ) +
        ggnetwork::theme_blank() +
        ggplot2::guides(col="none", fill="none")
}

# TO DO. Add options for customising plot
plot_clusters <- function(clusters_data, min_cluster_size = 3) {
    # filter
    clusters_data %<>% 
        dplyr::filter(!is.na(Cluster)) %>% 
        dplyr::add_count(Cluster, name = "cluster_size") %>% 
        dplyr::filter(cluster_size >= min_cluster_size)
    # Calculate point sizes based on the number of isolates on each date per cluster
    size_scale_factor <- clusters_data %>%
        dplyr::group_by(Cluster, formatted_date) %>%
        dplyr::reframe(size_scale = n())
    # merge
    clusters_data %<>% dplyr::left_join(size_scale_factor, by = c("Cluster", "formatted_date"))
    # plot
    ggpubr::ggscatter(
        clusters_data, x = "formatted_date", y = "ST", 
        color = "Cluster", palette = "viridis", ellipse = TRUE, ellipse.type = "convex",
        # shape = "Country",
        size = clusters_data$size_scale,  # Use the size scaling factor
        legend = "right", ggtheme = theme_bw(), 
        ylab = "ST",
        xlab = "Isolation date"
    ) + ggplot2::scale_x_date(labels = scales::date_format("%Y-%m"), date_breaks = "4 weeks")
}


Set up

# Data
DEMO_SNP_DATA <- "data/demo_data/euscape/pw-euscape-difference-matrix.csv"
DEMO_METADATA <- "data/demo_data/euscape/pw-euscape-metadata.csv"
DEMO_KLEBORATE_DATA <- "data/demo_data/euscape/pw-euscape-kleborate.csv"
# Thresholds
snp_distance_threshold <- 10
temporal_distance_threshold <- 14 # days
# Specs ---------------------------------------------------
REQUIRED_METADATA_COLS <- c('id', 'Year', 'Month', 'Day', 'Country') 

Load data

# load data
snp_data <- read_snp_csv(DEMO_SNP_DATA)
metadata <- read_metadata_csv(DEMO_METADATA)
kleborate_data <- read_kleborate_data_csv(DEMO_KLEBORATE_DATA)
# format sample dates
sample_dates <- format_sample_dates(metadata)

Get clusters

# get pairwise snp distance, days between isolates, and shared location
snp_and_epi_data <- get_snp_and_epi_data(snp_data, sample_dates, metadata, 
                                              geo_column = "Country")
# get cluster graph based on snp distance, days between isolates, and shared location
epi_snp_graph <- get_cluster_graph(snp_and_epi_data, 
                                   dist_column = c('dist', 'days'), 
                                   pair_location_column = "pair_location",
                                   dist_threshold = c(snp_distance_threshold, temporal_distance_threshold))
epi_snp_clusters <- get_cluster_membership_from_graph(epi_snp_graph) %>% 
    dplyr::right_join(metadata, by = 'id') %>% 
    dplyr::left_join(sample_dates, by = 'id') %>% 
    dplyr::left_join(kleborate_data, by = c('id' = 'Genome Name'))

Cluster proportion

cluster_proportion <- calculate_cluster_proportion(epi_snp_clusters)

Proportion of cases part of a transmission cluster = 0.31

Plots

# plot snp distance distribution
plot_dist_distribution(snp_and_epi_data, dist_column = "dist", x_label = "Pairwise distance (SNPs)")

# plot temporal distance distribution
plot_dist_distribution(snp_and_epi_data, dist_column = "days", x_label = "Pairwise temporal distance (days)")

# plot clusters
min_cluster_size <- 5
plot_clusters(epi_snp_clusters, min_cluster_size = min_cluster_size)

# plot transmission network
user_network <- ggnetwork::ggnetwork(epi_snp_graph)
## Warning in format_fortify(model = model, nodes = nodes, weights = "none", :
## duplicated edges detected
tranmission_plot <- plot_transmission_network(user_network, kleborate_data, "ST")
plotly::ggplotly(tranmission_plot)